Setup

Load packages

library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.border = element_rect(colour = "black", fill = NA),
  text = element_text(size = 20),
  strip.placement = "outside",
  strip.text = element_text(size = 20, color = "black"),
  strip.background = element_rect(fill = "white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(GenomicRanges)
library(openxlsx)
require(ggraph)
require(igraph)
source("scripts/functions.R")
source("../motif-analysis/mta_downstream_functions.R")
source("../gene-set-analysis/gene-set-analysis.R")

Data directories and metadata

atac_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "results"
)
cns_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "consensus_peaks"
)
bnd_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "footprint", "BINDetect"
)
rna_dir <- file.path(
  "RNASEQ_QUANTIFICATION", "results"
)
pub_dir <- file.path("published")
res_dir <- file.path("results")
fig_dir <- file.path("plots")
for (newdir in c(res_dir, fig_dir))
  dir.create(newdir, showWarnings = FALSE)

# metadata
des_fn <- file.path("RNASEQ_QUANTIFICATION", "design_table.tsv")
des_dt <- fread(des_fn)
setnames(des_dt, "reporter_line", "reporterline")
col_dt <- des_dt[, .(sample, reporterline, condition)]
col_dt[, reporterline := factor(
  reporterline,
  levels = c("Elav", "Fox", "Ncol")
)]
col_dt[, condition := factor(condition, levels = c("negative", "positive"))]
setorder(col_dt, reporterline, condition)

# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols <- c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
line_cond_cols <- c(
  "Elav_pos" = "#ff7f00", "Fox_pos" = "#984ea3", "Ncol_pos" = "#4daf4a",
  "Elav_neg" = "#ebbd8f", "Fox_neg" = "#d18adb", "Ncol_neg" = "#90d18e"
)
euler_cols <- c(
  line_cond_cols[grep("pos", names(line_cond_cols))],
  c(
    "Elav_pos&Ncol_pos" = "#ffce8c",
    "Elav_pos&Fox_pos" = "#f1b6ba",
    "Fox_pos&Ncol_pos" = "#daffcb",
    "Elav_pos&Fox_pos&Ncol_pos" = "#fffdfe"
  )
)

Gene annotations and lists

marks_gold <- fread(
  file.path("annotation", "golden-marks-231124.tsv"),
  fill = TRUE, sep = "\t", header = FALSE
)[, c(2:1)]
setnames(marks_gold, c("gene", "name"))
marks_gold_v <- structure(marks_gold$name, names = marks_gold$gene)

tfs_annt <- fread(
  file.path("annotation", "tfs.Nvec.tsv"),
  header = TRUE
)[, .SD[1], gene]
setnames(tfs_annt, c("gene", "name", "pfam"))

gene_annt <- fread(
  file.path("annotation", "Nvec_annotation_v3_2020-10-23_DToL_names"),
  select = 1:3
)
setnames(gene_annt, c("gene", "pfam", "name"))

gene_ann <- rbindlist(list(
  gene_annt[!gene %in% tfs_annt$gene],
  tfs_annt
), use.names = TRUE)
gene_ann[gene %in% marks_gold$gene, name := marks_gold_v[gene]]

Load normalized gene expression (RNA-seq) and motif accessibility (ATAC-seq) data

# GENES

# size norm gene expression
exps_mt <- read.table(
  file.path(rna_dir, "mat_norm.tsv")
)

# size and gene norm gene expression
expr_mt <- read.table(
  file.path(rna_dir, "norm_expression.tsv")
)

# gene expression TPMs
tpms_mt <- read.table(
  file.path(rna_dir, "tpm_expression.tsv")
)

# gene clusters
gene_clust <- fread(
  file.path(rna_dir, "genes_clusters.tsv")
)

# PEAKS

# peak accessibility
accs_mt <- read.table(
  file.path(atac_dir, "mat_norm.tsv")
)

# peak to gene assignment
assign <- fread(
  file.path(atac_dir, "consensusSeekeR-peaks-gene-assignment.tsv")
)

# peak clusters
pks_clust <- fread(
  file.path(atac_dir, "consensusSeekeR-peaks-clusters.bed")
)
bed_cols <- c(
  "seqnames", "start", "end", "peak", "score", "strand", "cluster", "group"
)
setnames(pks_clust, bed_cols)

# MOTIFS

# motif enrichment scores
motf_dt <- fread(
  file.path(atac_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(motf_dt, "label", "group")

# motif enrichment qvalue
motf_qv <- read.table(
  file.path(atac_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
  sep = "\t"
)

# motif enrichment log2fc
motf_fc <- read.table(
  file.path(atac_dir, "motif-enrich-archetypes-mona-fc.tsv"),
  sep = "\t"
)

# motif to tf assignment
dc <- fread(file.path(
  "annotation",
  "assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]

Correlation between motif binding and gene expression

Load fold changes for expression and motif binding

# binding comparisons fold changes for best correlated motifs
dp_dt <- fread(file.path(
  bnd_dir, "bindetect_results_reporterline.txt"
))
dp_dt[, id := paste(reporterline, vs, sep = "_vs_")]
dp_dt[, motif_gene := paste(motif, gene, sep = "__")]
dt_atac <- dcast.data.table(
  dp_dt, motif_gene ~ id, value.var = "log2FoldChange"
)
mt_atac <- as.matrix(dt_atac[, -1])
rownames(mt_atac) <- dt_atac$motif_gene

# RNA comparisons fold changes
mt_rna <- read.table(
  file.path(rna_dir, "log2fc_expression.tsv"),
  sep = "\t",
  header = TRUE
)

Dot plot of expression and binding score

dt_rna_plot <- melt.data.table(
  as.data.table(mt_rna, keep.rownames = "gene"),
  id.vars = "gene", variable.name = "comparison", value.name = "rna_fc"
)
dt_atac_plot <- melt.data.table(
  as.data.table(mt_atac, keep.rownames = "motif_gene"),
  id.vars = "motif_gene", variable.name = "comparison", value.name = "atac_fc"
)
dt_atac_plot[, gene := str_remove(motif_gene, "ARCH\\d+__")]
dt_atac_plot[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_comb <- merge.data.table(
  dt_rna_plot, dt_atac_plot,
  by = c("gene", "comparison")
)

# filtering
ids_rna <- rownames(mt_rna)[apply(mt_rna, 1, function(x) max(x) > 1.5)]
ids_atac <- rownames(mt_atac)[apply(mt_atac, 1, function(x) max(x) > 0.1)]
dt_plot <- dt_comb[gene %in% ids_rna & motif_gene %in% ids_atac]

# add gene annotation
dt_plot <- merge.data.table(
  dt_plot, gene_ann, by = "gene",
  all.x = TRUE, sort = FALSE
)

# label for plotting
dt_plot[, label := paste(
  substr(name, 1, 30), gene, motif,
  sep = " | "
)]

# clustering
tfs <- unique(dt_plot$gene)
hc <- hclust(dist(mt_rna[tfs, ]), method = "ward.D2")
ord <- tfs[hc$order]

# ordering
tfs <- unique(dt_plot$gene)
mord <- order(apply(mt_rna[tfs,], 1, which.max))
ord <- tfs[mord]

# order genes
dt_plot[, gene := factor(gene, levels = rev(ord))]
setorder(dt_plot, gene)
dt_plot[, label := factor(label, levels = unique(dt_plot$label))]
setorder(dt_plot, label)

# limits for plotting
dt_plot[rna_fc < 0, rna_fc := 0]
dt_plot[rna_fc > 8, rna_fc := 8]

# plot
require(ggplot2)
require(scales)
## Loading required package: scales
gp_dot <- ggplot(dt_plot, aes(comparison, label)) +
  geom_point(
    aes(size = rna_fc, fill = atac_fc),
    shape = 21,
    color = "black"
  ) +
  scale_fill_gradientn(
    colours = colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(1000),
    # limits = c(-5, 5), oob = squish, breaks = c(-5, 0, 5),
    name = "motif binding\nlog2(fold change)",
  ) +
  scale_size_continuous(
    # range = c(1, 7),
    # breaks = c(0, 2, 4, 6),
    name = "gene expression\nlog2(fold change)"
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
    legend.direction = "vertical"
  )
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
gp_dot

Select correlated genes

mt_bnd <- copy(mt_atac)
rownames(mt_bnd) <- str_remove(rownames(mt_bnd),  "ARCH\\d+__")
mt_exp <- mt_rna[rownames(mt_bnd), colnames(mt_bnd)]
rownames(mt_bnd) <- rownames(mt_atac)
rownames(mt_exp) <- rownames(mt_atac)

# correlation between same comparisons (columns)
mt_cors_smp <- cor(mt_bnd, mt_exp, method = "spearman")
diag(mt_cors_smp)
##  Elav_vs_Fox Elav_vs_Ncol  Elav_vs_neg  Fox_vs_Elav  Fox_vs_Ncol   Fox_vs_neg 
##   0.17200287   0.07699029   0.29388865   0.17200287   0.18450950   0.18314571 
## Ncol_vs_Elav  Ncol_vs_Fox  Ncol_vs_neg 
##   0.07699029   0.18450950   0.12235286
# correlation between genes (rows)
mt_cors_gen <- cor(t(mt_bnd), t(mt_exp), method = "spearman")
cors_gen <- diag(mt_cors_gen)
names(cors_gen) <- rownames(mt_cors_gen)
cors_gen <- sort(cors_gen, decreasing = TRUE)

# select correlated genes
select_pos <- names(cors_gen[(cors_gen) > 0.5])

# correlation distribution
dt_cors <- data.table(cor = cors_gen)
dt_cors$motif_gene <- names(cors_gen)
dt_cors[, correlation := "no"]
dt_cors[motif_gene %in% select_pos, correlation := "positive"]
dt_cors[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_cors[, gene := str_remove(motif_gene, "ARCH\\d+__")]

# plot
gp_hist <- ggplot(dt_cors, aes(cor, fill = correlation)) +
  geom_histogram(color = "black", binwidth = 0.05) +
  scale_y_continuous(expand = expansion(c(0, 0.1))) +
  scale_fill_manual(values = c(
    "positive" = "green",
    "no" = "grey"
  )) +
  labs(x = "correlation", y = "number of genes") +
  theme(legend.position = "none")
gp_hist

Inspect correlations between gene expression and motif binding score per sample

mt_ord <- cbind(mt_exp, mt_bnd)
colnames(mt_ord) <- paste(
  colnames(mt_ord),
  rep(c("RNA", "ATAC"), each = 9),
  sep = "_"
)
mt_dt <- as.data.table(mt_ord, keep.rownames = "motif_gene")
mt_dt[, motif := str_extract(motif_gene, "ARCH\\d+")]
mt_dt[, gene := str_remove(motif_gene, "ARCH\\d+__")]
mt_dt[, motif_gene := NULL]
dt_atac <- melt.data.table(
  mt_dt,
  id.vars = c("gene", "motif"),
  measure.vars = grep("ATAC", colnames(mt_dt), value = TRUE),
  value.name = "ATAC", variable.name = "comparison"
)
dt_atac[, comparison := str_remove(comparison, "_ATAC")]

dt_rna <- melt.data.table(
  mt_dt,
  id.vars = "gene",
  measure.vars = grep("RNA", colnames(mt_dt), value = TRUE),
  value.name = "RNA", variable.name = "comparison"
)
dt_rna[, comparison := str_remove(comparison, "_RNA")]

dt_comp <- merge.data.table(
  dt_rna, dt_atac,
  by = c("gene", "comparison"), all = TRUE
)
setcolorder(dt_comp, c("motif", "gene", "ATAC", "RNA"))
dt_comp[, reporterline := str_extract(comparison, "Elav|Fox|Ncol")]
dt_comp[, vs := str_extract(comparison, "vs.*")]
dt_comp[, vs := str_remove(vs, "vs_")]
dt_comp <- merge.data.table(
  dt_comp, dt_cors, by = c("motif", "gene"),
  all.x = TRUE, sort = FALSE
)

# add gene annotation
dt_comp <- merge.data.table(
  dt_comp, gene_ann, by = "gene",
  all.x = TRUE, sort = FALSE
)

# save
fwrite(
  dt_comp,
  file.path(res_dir, "correlation_expression_binding.tsv"),
  sep = "\t"
)

# label for plotting
dt_comp[, label := substr(name, 1, 20)]

# posterboys to label
dt_lab <- dt_comp[cor > 0.5 & abs(RNA) > 1 & abs(ATAC) > 0.1]

# plot
gp_comp <- ggplot(dt_comp, aes(RNA, ATAC, color = correlation)) +
  geom_point(data = dt_comp[correlation == "no"], alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text_repel(
    data = dt_lab[ATAC > 0],
    aes(label = label),
    color = "black",
    size = 3,
    segment.color = "grey50",
    min.segment.length = 0,
    nudge_y = 0.5 - dt_lab[ATAC > 0]$ATAC,
  ) +
  geom_text_repel(
    data = dt_lab[ATAC < 0],
    aes(label = label),
    color = "black",
    size = 3,
    segment.color = "grey50",
    min.segment.length = 0,
    nudge_y = -0.5 - dt_lab[ATAC < 0]$ATAC,
  ) +
  geom_point(data = dt_comp[correlation != "no"], alpha = 0.8) +
  scale_color_manual(values = c(
    "positive" = "green",
    "no" = "grey"
  )) +
  scale_y_continuous(expand = c(0.1, 0.1)) +
  facet_grid(vs ~ reporterline) +
  theme(legend.position = "bottom")
gp_comp
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Inspect correlation of binding and expression per gene

# binding correlations with expression
dt_comp <- fread(
  file.path(res_dir, "correlation_expression_binding.tsv")
)
dt_comp_cor <- unique(
  dt_comp[correlation != "no"][, .(motif, gene, reporterline)]
)

# for individual genes
comp_marks <- dt_comp[gene %in% marks_gold$gene]
comp_marks_gp <- ggplot(comp_marks, aes(ATAC, RNA, color = reporterline, shape = vs)) +
  scale_color_manual(values = line_cols) +
  scale_shape_manual(values = c("Elav" = 15, "Fox" = 16, "Ncol" = 17, "neg" = 18)) +
  ggpubr::stat_cor(aes(group = gene), method = "pearson", label.x = -0.5, label.y = 10, color = "black") +
  facet_wrap("name") +
  geom_hline(yintercept = 0, linetype = 2, size = 0.5) +
  geom_vline(xintercept = 0, linetype = 2, size = 0.5) +
  geom_point(size = 2) +
  theme(legend.position = "bottom") +
  labs(x = "motif binding logFC", y = "expression logFC")
comp_marks_gp

Gene networks

Load bound targets info, select those tfs and target genes that are also expressed in given sample, and add expression correlation info to construct baseline networks.

# binding correlations with expression
dt_comp <- fread(
  file.path(res_dir, "correlation_expression_binding.tsv")
)

# binding targets
mo_dt <- fread(
  file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
mo_dt <- mo_dt[expressed == TRUE & target_expressed == TRUE & bound == TRUE]
mo_dt <- mo_dt[grep("pos", sample)]
mo_dt[, reporterline := str_remove(sample, "_pos")]


# add TF-target gene expression correlation
g_cors_dt <- unique(mo_dt[, .(gene, target_gene)])
g_cors_dt <- g_cors_dt[gene %in% rownames(tpms_mt)]
g_cors_dt <- g_cors_dt[target_gene %in% rownames(tpms_mt)]
g_cors_dt[, expression_correlation := cor(
  unlist(tpms_mt[gene, ]),
  unlist(tpms_mt[target_gene, ])
), by = 1:nrow(g_cors_dt)]
mo_dt <- merge.data.table(
  mo_dt, g_cors_dt, by = c("gene", "target_gene"),
  all.x = TRUE, sort = FALSE
)
mo_dt <- mo_dt[!is.na(expression_correlation)]
mo_dt[, c("expressed", "target_expressed", "bound") := NULL]

# save
fwrite(
  mo_dt,
  file.path(res_dir, "network.tsv.gz"),
  sep = "\t"
)

Expression distribution

mo_dt <- fread(file.path(res_dir, "network.tsv.gz"))

# distribution of expression lfc
exp_tf_dt <- unique(mo_dt[, .(sample, gene, expression_lfc, expression_tpm)])
exp_tg_dt <- unique(mo_dt[, .(sample, target_gene, target_expression_lfc, target_expression_tpm)])
setnames(exp_tg_dt, colnames(exp_tg_dt), str_remove(colnames(exp_tg_dt), "target_"))

exp_dist_plot <- function(exp_dt, title = "Expression distribution", lfc_thr = 0) {
  gp_scat <- ggplot(
    exp_dt,
    aes(expression_lfc, expression_tpm, color = sample)
  ) + 
    geom_point(alpha = 0.8) +
    scale_color_manual(values = line_cond_cols) +
    scale_y_log10() +
    geom_vline(xintercept = lfc_thr, color = "red", linetype = 2) +
    theme(
      legend.position = "none",
      panel.grid.major = element_line(linewidth = 0.1)
    ) +
    labs(x = "expression logFC", y = "expression TPM")
  gp_dist_lfc <- ggplot(
    exp_dt,
    aes(sample, expression_lfc, fill = sample)
  ) +
    geom_boxplot(aes(color = sample)) +
    geom_boxplot(aes(fill = sample), outlier.colour = NA) +
    geom_hline(yintercept = lfc_thr, color = "red", linetype = 2) +
    scale_fill_manual(values = line_cond_cols) +
    scale_color_manual(values = line_cond_cols) +
    coord_flip() +
    theme(
      legend.position = "none",
      panel.grid.major = element_line(linewidth = 0.1),
      axis.title = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    )
  gp_dist_tpm <- ggplot(
    exp_dt,
    aes(sample, expression_tpm)
  ) +
    geom_boxplot(aes(color = sample)) +
    geom_boxplot(aes(fill = sample), outlier.colour = NA) +
    scale_fill_manual(values = line_cond_cols) +
    scale_color_manual(values = line_cond_cols) +
    scale_y_log10() +
    theme(
      legend.position = "none",
      panel.grid.major = element_line(linewidth = 0.1),
      axis.title = element_blank(),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    )
  
  gp_exp <- (gp_dist_lfc + plot_spacer() + gp_scat + gp_dist_tpm) + 
    plot_layout(
      nrow = 2, ncol = 2,
      heights = c(1, 3), widths = c(3, 1)
    ) +
    plot_annotation(title = title)
  gp_exp
}

gp_tf_exp <- exp_dist_plot(exp_tf_dt, title = "TFs expression")
gp_tf_exp

gp_tg_exp <- exp_dist_plot(exp_tg_dt, title = "Target genes expression")
gp_tg_exp

Filtering by expression

mo_dt <- mo_dt[expression_lfc > 0][target_expression_lfc > 0]
# mo_dt <- mo_dt[expression_tpm > 100][target_expression_tpm > 100]
setcolorder(mo_dt, c(
  "sample", "gene", "name", "pfam",
  "expression_tpm", "expression_lfc",
  "target_gene", "target_name", "target_pfam",
  "target_expression_tpm", "target_expression_lfc", 
  "motif", "motif_score", "footprint_score",
  "expression_correlation"
))

# save
fwrite(
  mo_dt,
  file.path(res_dir, "network_expression.tsv.gz"),
  sep = "\t"
)

Motif binding distribution

Footprint score distribution

mo_dt <- fread(
  file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
bnd_dt <- unique(
  mo_dt[expressed == TRUE & target_expressed == TRUE][, .(
    sample, gene, target_gene, motif, expression_lfc, footprint_score, bound
  )]
)
bnd_dt[, bound := ifelse(bound, "bound", "not bound")]
bnd_dt[, bound := factor(bound, levels = c("not bound", "bound"))]

gp_bnd_dist <- ggplot(bnd_dt, aes(footprint_score, color = sample, linetype = bound)) +
  geom_density(alpha = 0.1) +
  geom_vline(xintercept = 4, color = "red", linetype = 2) +
  scale_color_manual(values = line_cond_cols) +
  scale_linetype_manual(values = c("bound" = 1, "not bound" = 2)) +
  scale_fill_manual(values = c("bound" = "grey60", "not bound" = "grey100")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  scale_x_continuous(breaks = c(0, 4, seq(10, 100, 10)), expand = expansion(mult = c(0, 0)), limits = c(NA, 40), oob = scales::squish) +
  theme(
    legend.position = "bottom", legend.box = "vertical",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

gp_bnd_boxp <- ggplot(bnd_dt, aes(sample, footprint_score, fill = sample)) +
  geom_boxplot(aes(color = sample)) +
  geom_boxplot(aes(fill = sample), outlier.colour = NA) +
  scale_fill_manual(values = line_cond_cols) +
  scale_color_manual(values = line_cond_cols) +
  scale_y_continuous(trans = "log10") +
  facet_wrap("bound", scales = "free") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  )

gp_bnd_dist

Correlation of expression and motif binding

# select correlated motifs
dt_comp <- fread(
  file.path(res_dir, "correlation_expression_binding.tsv")
)
dt_comp[, sample := paste(reporterline, "_pos")]
dt_comp_cor <- unique(
  dt_comp[correlation != "no"][, .(motif, gene, sample)]
)
mo_dt_cor <- merge.data.table(
  dt_comp_cor, mo_dt, by = c("motif", "gene", "sample"), sort = FALSE
)

# save
fwrite(
  mo_dt_cor,
  file.path(res_dir, "network_correlation.tsv.gz"),
  sep = "\t"
)

Filtered network

Save network files

mo_dt <- fread(file.path(res_dir, "network_expression.tsv.gz"))
mo_dt[, reporterline := NULL]
mo_dt[, motif_name := NULL]
mo_dt[, target_TF := ifelse(target_gene %in% .SD$gene, TRUE, FALSE), sample]
mo_dt[, name := substr(name, 1, 30)]
mo_dt[, pfam := substr(pfam, 1, 30)]
mo_dt[, target_name := substr(target_name, 1, 30)]
mo_dt[, target_pfam := substr(target_pfam, 1, 30)]

require(openxlsx)
wb <- createWorkbook()
for (smp in c("Elav", "Ncol", "Fox")) {
  smp_fn <- file.path(res_dir, sprintf("network_%s.tsv.gz", smp))
  smp_dt <- mo_dt[sample == paste0(smp, "_pos")]
  message("Saving: ", smp_fn)
  fwrite(smp_dt, smp_fn, sep = "\t")
  # all genes
  addWorksheet(wb, sheetName = smp)
  writeData(wb, sheet = smp, smp_dt)
  # TF-TF
  tfs_dt <- smp_dt[target_TF == TRUE]
  tfs_orph <- smp_dt[!gene %in% tfs_dt$gene, .SD[1], gene]
  tfs_orph[, target_gene := "none"]
  tfs_orph[, c("target_name", "target_pfam") := ""]
  tfs_orph[, c("target_expression_tpm", "target_expression_lfc", "expression_correlation") := 0]
  tfs_dt <- rbindlist(list(tfs_dt, tfs_orph), use.names = TRUE)
  addWorksheet(wb, sheetName = paste0(smp, "_TF"))
  writeData(wb, sheet = paste0(smp, "_TF"), tfs_dt)
}
## Saving: results/network_Elav.tsv.gz
## Saving: results/network_Ncol.tsv.gz
## Saving: results/network_Fox.tsv.gz
saveWorkbook(
    wb,
    file.path(res_dir, "network.xlsx"),
    overwrite = TRUE
)

How many target genes for each TF?

hits_dt <- mo_dt[, .N, .(gene, motif, sample)]
hits_gp <- ggplot(hits_dt, aes(sample, N, fill = sample)) +
  ggbeeswarm::geom_beeswarm(shape = 21) +
  geom_boxplot(outlier.color = NA, alpha = 0.2) +
  scale_y_continuous(trans = "log10") +
  scale_fill_manual(values = line_cond_cols) +
  labs(y = "target genes per TF") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none",
    panel.grid.major.y = element_line(linewidth = 0.5),
    panel.grid.minor.y = element_line(linewidth = 0.2)
  ) +
  geom_text(
    data = hits_dt[,.N,sample],
    aes(label = N, y = 15)
  )
hits_gp

TFs overlapping between the lines

require(eulerr)
## Loading required package: eulerr
mo_dt <- fread(file.path(res_dir, "network_expression.tsv.gz"))
mo_dt[, gene_n_samples := length(unique(.SD$sample)), .(gene, name, pfam)]
gene_ovl_dt <- unique(mo_dt[, .(gene, sample, gene_n_samples)])
gene_ovl_dt <- dcast.data.table(gene_ovl_dt, gene ~ sample, value.var = "gene_n_samples")
gene_ovl_dt <- unique(gene_ovl_dt)
gene_ovl_mt <- as.matrix(gene_ovl_dt[, -1])
gene_ovl_mt[!is.na(gene_ovl_mt)] <- 1
gene_ovl_mt[is.na(gene_ovl_mt)] <- 0
fit <- euler(gene_ovl_mt)
nms <- names(fit$fitted.values)
p1 <- plot(
  fit,
  quantities = TRUE,
  fills = euler_cols[nms],
  main = "TFs overlap"
)
print(p1)

For all TFs, compare sets of targets between cell lines

## png 
##   2

For all target genes, compare sets of TFs that regulate them between cell lines (i.e. genes upregulated in two or three cell lines, but differently regulated)

mo_dt <- fread(file.path(res_dir, "network_expression.tsv.gz"))

# count number of cell lines where each tf - target gene is present
mo_dt[, n_samples_edge := length(unique(.SD$sample)), .(motif, target_gene)]
mo_dt[, n_samples_gene := length(unique(.SD$sample)), .(motif)]
mo_dt[, n_samples_target := length(unique(.SD$sample)), .(target_gene)]

# per target gene, the max number of samples in which any of its regulatory link is present
mo_dt[, n_samples_edge_max := max(.SD$n_samples_edge), .(target_gene)]

# of the genes that are expressed in more than one cell line, which ones are completely differently regulated?
mo_dif <- unique(
  mo_dt[, .(target_gene, target_name, target_pfam, n_samples_target, n_samples_edge_max)]
)[n_samples_target > 1 & n_samples_edge_max==1]
setorder(mo_dif, -n_samples_target)

# plot euler diagram
pdf(
  file.path(fig_dir, "target_genes_motifs_novl.pdf"),
  width = 7, height = 4
)
for (tg in unique(mo_dif$target_gene)) {
  message(tg)
  nm <- substr(gene_ann[gene == tg]$name, 1, 25)
  pf <- substr(gene_ann[gene == tg]$pfam, 1, 25)
  tg_dt <- unique(mo_dt[target_gene == tg][, .(motif, sample, motif_score)])
  tg_dt <- dcast.data.table(tg_dt, motif ~ sample, value.var = "motif_score")
  tg_dt <- unique(tg_dt)
  tg_mt <- as.matrix(tg_dt[, -1])
  tg_mt[!is.na(tg_mt)] <- 1
  tg_mt[is.na(tg_mt)] <- 0
  fit <- euler(tg_mt)
  nms <- names(fit$fitted.values)
  p1 <- plot(
    fit,
    quantities = TRUE,
    fills = euler_cols[nms],
    main = sprintf("\n\n%s\n%s; %s", tg, nm, pf)
  )
  print(p1)

}
dev.off()

# plot heatmap
pdf(
  file.path(fig_dir, "target_genes_motifs_novl_heatmap.pdf"),
  width = 12, height = 12
)
for (tg in unique(mo_dif$target_gene)) {
  message(tg)
  nm <- substr(gene_ann[gene == tg]$name, 1, 35)
  pf <- substr(gene_ann[gene == tg]$pfam, 1, 35)
  tg_dt <- mo_dt[target_gene==tg]
  setorder(tg_dt, sample)
  tg_dt[, label := paste(motif, str_remove(gene, "Nvec_(vc1.1_)*"), substr(name, 1, 35), sep = " | ")]
  tg_dt[, label := factor(label, levels = unique(label))]
  tg_gp <- ggplot(tg_dt, aes(sample, label, fill = footprint_score, size = expression_lfc)) +
    #geom_tile(color = "white") +
    geom_point(shape = 21) +
    scale_size_continuous(range = c(1, 16)) +
    scale_y_discrete(position = "right") +
    scale_fill_distiller(palette = "RdPu", direction = 1) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      panel.grid.major = element_line(color = "grey", linewidth = 0.2)
    ) +
    labs(
      title = sprintf("\n\n%s\n%s; %s", tg, nm, pf),
      y = "", x = ""
    )
  print(tg_gp)
}
dev.off()

# of the genes that are expressed in more than one cell line, which ones are partially differently regulated?
mo_par <- unique(
  mo_dt[, .(target_gene, target_name, target_pfam, n_samples_target, n_samples_edge_max)]
)[n_samples_target > 1 & n_samples_edge_max == 3]
setorder(mo_par, -n_samples_target)

# plot euler diagram
pdf(
  file.path(fig_dir, "target_genes_motifs_povl.pdf"),
  width = 7, height = 4
)
for (tg in unique(mo_par$target_gene)) {
  message(tg)
  nm <- substr(gene_ann[gene == tg]$name, 1, 25)
  pf <- substr(gene_ann[gene == tg]$pfam, 1, 25)
  tg_dt <- unique(mo_dt[target_gene == tg][, .(motif, sample, motif_score)])
  tg_dt <- dcast.data.table(tg_dt, motif ~ sample, value.var = "motif_score")
  tg_dt <- unique(tg_dt)
  tg_mt <- as.matrix(tg_dt[, -1])
  tg_mt[!is.na(tg_mt)] <- 1
  tg_mt[is.na(tg_mt)] <- 0
  fit <- euler(tg_mt)
  nms <- names(fit$fitted.values)
  p1 <- plot(
    fit,
    quantities = TRUE,
    fills = euler_cols[nms],
    main = sprintf("\n\n%s\n%s; %s", tg, nm, pf)
  )
  print(p1)

}
dev.off()

# plot heatmap
pdf(
  file.path(fig_dir, "target_genes_motifs_povl_heatmap.pdf"),
  width = 12, height = 16
)
for (tg in unique(mo_par$target_gene)) {
  message(tg)
  nm <- substr(gene_ann[gene == tg]$name, 1, 35)
  pf <- substr(gene_ann[gene == tg]$pfam, 1, 35)
  tg_dt <- mo_dt[target_gene==tg]
  setorder(tg_dt, sample)
  tg_dt[, label := paste(motif, str_remove(gene, "Nvec_(vc1.1_)*"), substr(name, 1, 35), sep = " | ")]
  tg_dt[, label := factor(label, levels = unique(label))]
  tg_gp <- ggplot(tg_dt, aes(sample, label, fill = footprint_score, size = expression_lfc)) +
    #geom_tile(color = "white") +
    geom_point(shape = 21) +
    scale_size_continuous(range = c(1, 16)) +
    scale_y_discrete(position = "right") +
    scale_fill_distiller(palette = "RdPu", direction = 1) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      panel.grid.major = element_line(color = "grey", linewidth = 0.2)
    ) +
    labs(
      title = sprintf("\n\n%s\n%s; %s", tg, nm, pf),
      y = "", x = ""
    )
  print(tg_gp)
}
dev.off()

GO enrichment

# all networks
net <- fread(file.path(res_dir, "network_expression.tsv.gz"))
net[, reporterline := str_remove(sample, "_pos")]

# GO annotations
go_annotations <- readRDS(file.path("annotation", "Nvec_ensembl.GO.rds"))

# where to save results
go_dir <- file.path(res_dir, "GO")
dir.create(go_dir)

# enrichment test per reporter line
tfs_dt <- rbindlist(lapply(unique(net$reporterline), function(st) {

    message("\n", "Starting GO analysis for ", st, "\n")
    net_st <- net[reporterline == st]

    # enrichment test per TF
    tfs <- unique(net_st[, .N, gene][order(-N)][N > 10]$gene)
    tfs_dt <- rbindlist(lapply(tfs, function(tf) {
        genes_fg <- unique(net_st[gene == tf]$target_gene)
        genes_bg <- unique(net$target_gene)
        message(sprintf(
            "%s (%s/%s); %s target genes; %s background genes",
            tf, match(tf, tfs), length(tfs), length(genes_fg), length(genes_bg)
        ))
        go_enrichment_table_tf <- gsa_topgo_enrichment(
            annotation = go_annotations,
            genes_fg = genes_fg,
            genes_bg = genes_bg,
            output_prefix = file.path(go_dir, "GO"),
            name_fg = sprintf("%s_%s", st, tf),
            ontologyset = c("BP", "MF", "CC"),
            tg_test = "fisher",
            tg_algorithm = "elim",
            top_markers = 30,
            nodesize = 10,
            printfile = FALSE
        )
        go_tf <- as.data.table(go_enrichment_table_tf)
        go_tf[, gene := tf]

    }), fill = TRUE)
    tfs_dt[, reporterline := st]

}))

# save
saveRDS(tfs_dt, file.path(res_dir, "go_res_per_tf.rds"))

Plot results

# load PFAM enrichment results
tfs_dt <- readRDS(file.path(res_dir, "go_res_per_tf.rds"))
setnames(tfs_dt, "GO.ID", "go_id")
tfs_dt[, annot := paste(Term, go_id, sep = " | ")]

# add TF gene annotations
tfs_dt <- merge.data.table(
  tfs_dt, gene_ann, by = "gene", all.x = TRUE, sort = FALSE
)

for (x in c("Elav", "Ncol", "Fox")) {
  
  rep_dt <- tfs_dt[reporterline == x]
  
  # select significant categories
  anns <- unique(rep_dt[pval_test < 0.01]$annot)
  if (length(anns) > 50) 
    anns <- unique(rep_dt[, .(pval_test, annot)])[order(pval_test)]$annot[1:50]
  anns_dt <- rep_dt[annot %in% anns]
  
  # cluster categories and genes  
  anns_dc <- dcast.data.table(anns_dt, gene ~ annot, value.var = "pval_adj")
  anns_mt <- as.matrix(anns_dc[, -1])
  rownames(anns_mt) <- anns_dc[[1]]
  anns_mt[is.na(anns_mt)] <- 1
  anns_hc <- hclust(dist(anns_mt), method = "ward.D2")
  anns_mt <- anns_mt[anns_hc$order, ]
  anns_on <- unique(anns_dt[,.(annot, ontology)])[order(ontology)]
  anns_ord <- unlist(tapply(anns_on$annot, anns_on$ontology, function(x) {
    anns_hc <- hclust(dist(t(anns_mt[, x])), method = "ward.D2")
    x[anns_hc$order]
  }), use.names = FALSE)
  anns_ord <- as.character(anns_ord)
  anns_mt <- anns_mt[, anns_ord]
  
  # order
  anns_dt[, gene := factor(gene, levels = rownames(anns_mt))]
  anns_dt[, annot := factor(annot, levels = colnames(anns_mt))]
  setorder(anns_dt, gene, annot)
  anns_dt[, label := paste(substr(name, 1, 30), str_remove(gene, "Nvec_(vc1.1_)*"), sep = " | ")]
  anns_dt[, label := factor(label, levels = unique(anns_dt$label))]
  
  # plot
  gp_go <- ggplot(anns_dt, aes(label, annot, color = pval_test, size = Significant)) +
    #geom_tile(color = "white") +
    geom_point() +
    scale_size_continuous(range = c(1, 8)) +
    scale_color_distiller(palette = "BuPu") +
    facet_grid(ontology ~ ., scales = "free", space = "free") +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      panel.grid.major = element_line(linewidth = 0.2, color = "grey90")
    ) +
    labs(title = x)
  pw <- pmax(16, length(unique(anns_dt$label)) / 3)
  ph <- pmax(12, length(unique(anns_dt$annot)) / 2.5)
  pdf(
    file.path(fig_dir, sprintf("go_annot_%s.pdf", x)),
    width = pw, height = ph
  )
  print(gp_go)
  dev.off()
}

PFAM enrichment

# all networks
net <- fread(file.path(res_dir, "network_expression.tsv.gz"))
net[, reporterline := str_remove(sample, "_pos")]

# PFAM annotations
pfam_annotations <- gsa_enrichment_load_pfam_list(
  file.path("annotation", "Nvec_long.pep.pfamscan_archs.csv")
)

# where to save results
pfam_dir <- file.path(res_dir, "PFAM")
dir.create(pfam_dir)

# enrichment test per reporter line
tfs_dt <- rbindlist(lapply(unique(net$reporterline), function(st) {

    message("\n", "Starting PFAM analysis for ", st, "\n")
    net_st <- net[reporterline == st]

    # enrichment test per TF
    tfs <- unique(net_st[, .N, gene][order(-N)][N > 10]$gene)
    tfs_dt <- rbindlist(lapply(tfs, function(tf) {
        genes_fg <- unique(net_st[gene == tf]$target_gene)
        genes_bg <- unique(net$target_gene)
        message(sprintf(
            "%s (%s/%s); %s target genes; %s background genes",
            tf, match(tf, tfs), length(tfs), length(genes_fg), length(genes_bg)
        ))
        enrichment_table <- gsa_enrichment_hypergeometric(
            annotation = pfam_annotations,
            genes_fg = genes_fg,
            genes_bg = genes_bg,
            output_prefix = file.path(pfam_dir, "PFAM"),
            name_fg = sprintf("%s_%s", st, tf)
        )
        pfam_tf <- as.data.table(enrichment_table)
        pfam_tf[, gene := tf]

    }), fill = TRUE)
    tfs_dt[, reporterline := st]

}))

# save
saveRDS(tfs_dt, file.path(res_dir, "pfam_res_per_tf.rds"))

Plot results

# load PFAM enrichment results
tfs_dt <- readRDS(file.path(res_dir, "pfam_res_per_tf.rds"))

# add TF gene annotations
tfs_dt <- merge.data.table(
  tfs_dt, gene_ann, by = "gene", all.x = TRUE, sort = FALSE
)

for (x in c("Elav", "Ncol", "Fox")) {
  
  rep_dt <- tfs_dt[reporterline == x]
  
  # select significant categories
  anns <- rep_dt[pval_adj < 0.05]$annot
  anns_dt <- rep_dt[annot %in% anns]
  
  # cluster categories and genes
  anns_dc <- dcast.data.table(anns_dt, gene ~ annot, value.var = "pval_adj")
  anns_mt <- as.matrix(anns_dc[, -1])
  rownames(anns_mt) <- anns_dc[[1]]
  anns_mt[is.na(anns_mt)] <- 1
  anns_hc <- hclust(dist(anns_mt), method = "ward.D2")
  anns_mt <- anns_mt[anns_hc$order, ]
  anns_hc <- hclust(dist(t(anns_mt)), method = "ward.D2")
  anns_mt <- anns_mt[, anns_hc$order]
  
  # order
  anns_dt[, gene := factor(gene, levels = rownames(anns_mt))]
  anns_dt[, annot := factor(annot, levels = colnames(anns_mt))]
  setorder(anns_dt, gene, annot)
  anns_dt[, label := paste(substr(name, 1, 30), str_remove(gene, "Nvec_(vc1.1_)*"), sep = " | ")]
  anns_dt[, label := factor(label, levels = unique(anns_dt$label))]
  
  # plot
  gp_pfam <- ggplot(anns_dt, aes(label, annot, color = pval_adj, size = freq_in_fg)) +
    #geom_tile(color = "white") +
    geom_point() +
    scale_size_continuous(range = c(1, 8)) +
    scale_color_distiller(palette = "BuPu") +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      panel.grid.major = element_line(linewidth = 0.2, color = "grey90")
    ) +
    labs(title = x)
  pw <- pmax(12, length(unique(anns_dt$label)) / 4)
  ph <- pmax(12, length(unique(anns_dt$annot)) / 3)
  pdf(
    file.path(fig_dir, sprintf("pfam_annot_%s.pdf", x)),
    width = pw, height = ph
  )
  print(gp_pfam)
  dev.off()
}

Cnidocytes network

Comparison with Ozment et al. 2021 eLife paper.

Parse data from paper

# load data from paper and translate gene IDs to DToL IDs
pou_act <- readWorkbook(file.path(pub_dir, "elife-74336-supp6-v3.xlsx"), startRow = 2, colNames = TRUE)
pou_rep <- readWorkbook(file.path(pub_dir, "elife-74336-supp7-v3.xlsx"), startRow = 2, colNames = TRUE)
setDT(pou_act)
setDT(pou_rep)
pou_pub <- rbindlist(list(activated = pou_act, repressed = pou_rep), idcol = "regulation")
setnames(pou_pub, "gene", "ID_Vienna")
dict <- fread(file.path("annotation", "DICTIONARY_Nvec_vienna_vs_Nvec_old_vs_Nvec_DTOL_v2.txt"))[, .(ID_Vienna, ID_DToL)]
setnames(dict, "ID_DToL", "gene")
pou_pub <- merge.data.table(pou_pub, dict, all.x = TRUE, sort = FALSE)
pou_pub <- pou_pub[, .(gene, regulation, fold_change, pvalue)]
setnames(pou_pub, "gene", "target_gene")
fwrite(
  pou_pub,
  file.path(pub_dir, "pou4_targets_elife.tsv"),
  sep = "\t"
)

Compare targets

# load data
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]

# load data from paper
pou_pub <- fread(file.path(pub_dir, "pou4_targets_elife.tsv"))

# overlap of Pou4 targets
pou_pub_act <- pou_pub[regulation == "activated", .(target_gene, fold_change)]
setnames(pou_pub_act, "fold_change", "activated")
pou_pub_rep <- pou_pub[regulation == "repressed", .(target_gene, fold_change)]
setnames(pou_pub_rep, "fold_change", "repressed")
pou_net_gen <- pou_net[, .(target_gene, target_expression_lfc)]
setnames(pou_net_gen, "target_expression_lfc", "network")
pou_dt <- merge.data.table(merge.data.table(
  pou_pub_act, pou_pub_rep,
  by = "target_gene", all = TRUE
), pou_net_gen, by = "target_gene", all = TRUE)
pou_mt <- pou_dt[, lapply(.SD, function(x) !is.na(x)), .SD = c("activated", "repressed", "network")]
pou_mt[, target_gene := pou_dt$target_gene]
setcolorder(pou_mt, "target_gene")
pou_mt <- unique(pou_mt)
pou_mt <- as.matrix(pou_mt[, -1])

require(eulerr)
fit <- euler(pou_mt)
p1 <- plot(
  fit,
  quantities = TRUE
)
print(p1)

Inspect the differences

# reverse activation and repression sign (mutants)
pou_dt[, published := -1 * activated][is.na(published), published := -1 * repressed]

# sets of target genes
missing_in_network <- unique(pou_dt[!is.na(published) & is.na(network)]$target_gene)
missing_in_network_act <- unique(pou_dt[!is.na(activated) & is.na(network)]$target_gene)
missing_in_network_rep <- unique(pou_dt[!is.na(repressed) & is.na(network)]$target_gene)
missing_in_published <- unique(pou_dt[is.na(published)]$target_gene)
target_intersect <- unique(pou_dt[!is.na(published) & !is.na(network)]$target_gene)
mo_dt <- fread(
  file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
target_bound <- unique(mo_dt[sample=="Ncol_pos" & gene == pou4]$target_gene)

# expression for target genes
pou_expression_dt <- unique(ncol_dt[, .(target_gene, target_expression_lfc, motif_score, footprint_score, expression_correlation)])
pou_expression_dt[target_gene %in% missing_in_network_act, annot := "activated"]
pou_expression_dt[target_gene %in% missing_in_network_rep, annot := "repressed"]
pou_expression_dt[target_gene %in% missing_in_published, annot := "network"]
pou_expression_dt[target_gene %in% target_intersect, annot := "intersect"]
pou_expression_dt <- pou_expression_dt[!is.na(annot)]
pou_expression_dt[, bound := ifelse(target_gene %in% target_bound, "binding", "no binding")]
# plot
pou_expression_dt[, annot := factor(
  annot, levels = c("network", "intersect", "activated", "repressed")
)]
gp_pou_exp <- ggplot(pou_expression_dt, aes(annot, target_expression_lfc, fill = bound)) +
  geom_boxplot() +
  # ggbeeswarm::geom_quasirandom() +
  geom_boxplot(outlier.color = NA, alpha = 0.5) +
  scale_fill_manual(values = c("grey40", "grey80")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "target genes", y = "expression log FC")
gp_pou_exp

Genes in intersect

pou_int <- pou_net[target_gene %in% target_intersect]
pou_int <- unique(merge.data.table(
  pou_int, pou_pub[, .(target_gene, regulation)],
  by = "target_gene", all.x = TRUE, sort = FALSE
))
pou_int[, target_TF := target_gene %in% tfs_annt$gene]

# plot
gp_pou_int <- ggplot(pou_int, aes(
  expression_correlation, target_expression_lfc,
  #size = 1/n_samples,
  shape = target_TF,
  fill = regulation,
  label = target_name
)) +
  ggrepel::geom_text_repel(color = "black") +
  geom_point(color = "grey10") +
  scale_size_continuous(
    range = c(1, 6),
    breaks = seq(1/3, 1, length.out = 3),
    labels = seq(3, 1),
    name = "target in number of samples\n(Elav, Ncol, Fox)"
  ) +
  scale_shape_manual(
    values = c("TRUE" = 22, "FALSE" = 21),
    name = "target TF?"
  ) +
  scale_fill_manual(
    values = c("activated" = "grey90", "repressed" = "grey50"),
    name = "relation"
  ) +
  guides(
    fill = guide_legend(override.aes = list(size = 5, shape = 21)),
    shape = guide_legend(override.aes = list(size = 5)),
    size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
  ) +
  labs(
    x = "Pou4 expression correlation",
    y = "target gene expression logFC",
    title = sprintf("%s target genes in intersect", length(unique(pou_int$target_gene)))
  )
gp_pou_int
## Warning: ggrepel: 31 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Comparison with Tournière et al. 2020 Cell Reports

Parse data from the paper

# load data from paper and translate gene IDs to DToL IDs
pou_mut <- readWorkbook(file.path(pub_dir, "1-s2.0-S2211124720303454-mmc2.xlsx"), startRow = 1, colNames = TRUE)
setDT(pou_mut)
setnames(pou_mut, "ID", "ID_Vienna")
dict <- fread(file.path("annotation", "DICTIONARY_Nvec_vienna_vs_Nvec_old_vs_Nvec_DTOL_v2.txt"))[, .(ID_Vienna, ID_DToL)]
setnames(dict, "ID_DToL", "gene")
pou_mut <- merge.data.table(pou_mut, dict, all.x = TRUE, sort = FALSE)
setnames(
  pou_mut,
  c("gene", "Regulation", "log2FoldChange"),
  c("target_gene", "regulation", "fold_change")
)
pou_mut[regulation == "UP", regulation := "up"]
pou_mut[regulation == "Down", regulation := "down"]
pou_mut <- pou_mut[, .(target_gene, regulation, fold_change, padj)]
fwrite(
  pou_mut,
  file.path(pub_dir, "pou4_targets_cellrep.tsv"),
  sep = "\t"
)

Compare targets

# load our Ncol network
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]

# load data from paper
pou_mut <- fread(file.path(pub_dir, "pou4_targets_cellrep.tsv"))

# overlap of Pou4 targets
pou_mut_act <- pou_mut[regulation == "up", .(target_gene, fold_change)]
setnames(pou_mut_act, "fold_change", "up")
pou_mut_rep <- pou_mut[regulation == "down", .(target_gene, fold_change)]
setnames(pou_mut_rep, "fold_change", "down")
pou_net_gen <- pou_net[, .(target_gene, target_expression_lfc)]
setnames(pou_net_gen, "target_expression_lfc", "network")
pou_dt <- unique(merge.data.table(merge.data.table(
  pou_mut_act, pou_mut_rep,
  by = "target_gene", all = TRUE
), pou_net_gen, by = "target_gene", all = TRUE))
pou_mt <- pou_dt[, lapply(.SD, function(x) !is.na(x)), .SD = c("down", "up", "network")]
pou_mt[, target_gene := pou_dt$target_gene]
setcolorder(pou_mt, "target_gene")
pou_mt <- unique(pou_mt)
pou_mt <- as.matrix(pou_mt[, -1])

require(eulerr)
fit <- euler(pou_mt)
p1 <- plot(
  fit,
  quantities = TRUE
)
print(p1)

Inspect the differences

# reverse activation and repression sign (mutants)
pou_dt[, published := -1 * up][is.na(published), published := -1 * down]

# sets of target genes
missing_in_network <- unique(pou_dt[!is.na(published) & is.na(network)]$target_gene)
missing_in_network_act <- unique(pou_dt[!is.na(down) & is.na(network)]$target_gene)
missing_in_network_rep <- unique(pou_dt[!is.na(up) & is.na(network)]$target_gene)
missing_in_published <- unique(pou_dt[is.na(published)]$target_gene)
target_intersect <- unique(pou_dt[!is.na(published) & !is.na(network)]$target_gene)
mo_dt <- fread(
  file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
target_bound <- unique(mo_dt[sample=="Ncol_pos" & gene == pou4]$target_gene)

# expression for target genes
pou_expression_dt <- unique(ncol_dt[, .(target_gene, target_expression_lfc)])
# pou_expression_dt[target_gene %in% missing_in_network, annot := "published"]
pou_expression_dt[target_gene %in% missing_in_network_act, annot := "down"]
pou_expression_dt[target_gene %in% missing_in_network_rep, annot := "up"]
pou_expression_dt[target_gene %in% missing_in_published, annot := "network"]
pou_expression_dt[target_gene %in% target_intersect, annot := "intersect"]
pou_expression_dt <- pou_expression_dt[!is.na(annot)]
pou_expression_dt[, bound := ifelse(target_gene %in% target_bound, "binding", "no binding")]

# plot
pou_expression_dt[, annot := factor(
  annot, levels = c("network", "intersect", "down", "up")
)]
gp_pou_exp <- ggplot(pou_expression_dt, aes(annot, target_expression_lfc, fill = bound)) +
  geom_boxplot() +
  ggbeeswarm::geom_quasirandom() +
  geom_boxplot(outlier.color = NA, alpha = 0.5) +
  scale_fill_manual(values = c("grey40", "grey80")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "target genes", y = "expression log FC")

Genes in intersect

pou_int <- pou_net[target_gene %in% target_intersect]
pou_int <- unique(merge.data.table(
  pou_int, pou_mut[, .(target_gene, regulation)],
  by = "target_gene", all.x = TRUE, sort = FALSE
))
pou_int[, target_TF := target_gene %in% tfs_annt$gene]

# plot
gp_pou_int <- ggplot(pou_int, aes(
  expression_correlation, target_expression_lfc,
  #size = 1/n_samples,
  shape = target_TF,
  fill = regulation,
  label = target_name
)) +
  ggrepel::geom_text_repel(color = "black") +
  geom_point(color = "grey10") +
  scale_size_continuous(
    range = c(1, 6),
    breaks = seq(1/3, 1, length.out = 3),
    labels = seq(3, 1),
    name = "target in number of samples\n(Elav, Ncol, Fox)"
  ) +
  scale_shape_manual(
    values = c("TRUE" = 22, "FALSE" = 21),
    name = "target TF?"
  ) +
  scale_fill_manual(
    values = c("down" = "grey90", "up" = "grey50"),
    name = "regulation"
  ) +
  guides(
    fill = guide_legend(override.aes = list(size = 5, shape = 21)),
    shape = guide_legend(override.aes = list(size = 5)),
    size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
  ) +
  labs(
    x = "Pou4 expression correlation",
    y = "target gene expression logFC",
    title = sprintf("%s target genes in intersect", length(unique(pou_int$target_gene)))
  )
gp_pou_int
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Combined comparison

Overlaps

# load published data
pou_cellrep <- fread(file.path(pub_dir, "pou4_targets_cellrep.tsv"))
pou_elife <- fread(file.path(pub_dir, "pou4_targets_elife.tsv"))

# load our Ncol network
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]

# overlap of Pou4 targets
pou_dt <- unique(rbindlist(list(
  pou_cellrep[, .(target_gene, regulation)],
  pou_elife[, .(target_gene, regulation)],
  pou_net[, .(target_gene)][, regulation := "network"]
)))
pou_dc <- dcast.data.table(pou_dt, target_gene ~ regulation, value.var = "regulation")
pou_dc <- unique(pou_dc)
pou_mt <- pou_dc[, lapply(.SD, function(x) !is.na(x)), .SD = c("down", "up", "activated", "repressed", "network")]
pou_mt <- as.matrix(pou_mt)

pou_cols <- c(
  "down" = "#add8e6ff",
  "up" = "#f08080ff",
  "activated" = "#3949dbff",
  "repressed" = "#de2346ff",
  "network" = "#ffecb3ff"
)

require(eulerr)
set.seed(1950)
fit <- euler(pou_mt)
p1 <- plot(
  fit,
  quantities = TRUE,
  fills = pou_cols
)
print(p1)

Genes in intersect

pou_int <- pou_dc[!is.na(network) & (!is.na(activated) | !is.na(repressed)) & (!is.na(up) | !is.na(down))]
pou_int <- pou_net[target_gene %in% pou_int$target_gene]
pou_int <- unique(merge.data.table(
  pou_int, pou_elife[, .(target_gene, regulation)],
  by = "target_gene", all.x = TRUE, sort = FALSE
))
pou_int[, target_TF := target_gene %in% tfs_annt$gene]
fwrite(
  pou_int,
  file.path(res_dir, "pou4_target_genes_intersect.tsv"),
  sep = "\t"
)

# plot
gp_pou_int <- ggplot(pou_int, aes(
  expression_correlation, target_expression_lfc,
  #size = 1/n_samples,
  shape = target_TF,
  fill = regulation,
  label = target_name
)) +
  ggrepel::geom_text_repel(color = "black") +
  geom_point(color = "grey10") +
  scale_size_continuous(
    range = c(1, 6),
    breaks = seq(1/3, 1, length.out = 3),
    labels = seq(3, 1),
    name = "target in number of samples\n(Elav, Ncol, Fox)"
  ) +
  scale_shape_manual(
    values = c("TRUE" = 22, "FALSE" = 21),
    name = "target TF?"
  ) +
  scale_fill_manual(
    values = c("activated" = "grey90", "repressed" = "grey50"),
    name = "regulation"
  ) +
  guides(
    fill = guide_legend(override.aes = list(size = 5, shape = 21)),
    shape = guide_legend(override.aes = list(size = 5)),
    size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
  ) +
  labs(
    x = "Pou4 expression correlation",
    y = "target gene expression logFC",
    title = sprintf("%s target genes in intersect", length(unique(pou_int$target_gene)))
  )
gp_pou_int
## Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Difference

# all genes expressed in Ncol, even if they are not bound by Pou4
mo_dt <- fread(file.path(atac_dir, "motifs_genes_targets_long.tsv.gz"))
all_dt <- mo_dt[sample == "Ncol_pos"][gene == pou4]
oth_dt <- mo_dt[sample == "Ncol_pos"][!target_gene %in% pou_dt$target_gene][, .SD[1], .(target_gene)]
oth_dt[, c("gene", "name", "pfam", "motif", "motif_name", "motif_score", "expression_tpm", "expression_lfc", "expressed", "bound", "footprint_score") := NA]
all_dt <- rbindlist(list(all_dt, oth_dt), use.names = TRUE)

# categories of genes bound by Pou4
pou_dt <- unique(rbindlist(list(
  pou_cellrep[, .(target_gene, regulation)],
  pou_elife[, .(target_gene, regulation)],
  pou_net[, .(target_gene)][, regulation := "network"]
)))
pou_dt[, regulation := factor(regulation, levels = c("network", "up", "activated", "down", "repressed"))]
pou_dt[, regulation := paste(as.character(sort(unique(.SD$regulation))), collapse = "+"), target_gene]

# combine
all_dt <- merge.data.table(all_dt, pou_dt, by = "target_gene", all.y = TRUE)
all_dt[, sample := NULL]
sum_dt <- all_dt[,length(unique(.SD$target_gene)),regulation][order(-V1)][V1>1]
sub_dt <- all_dt[regulation %in% sum_dt$regulation]
sub_dt[, regulation := factor(regulation, levels = unique(sum_dt$regulation))]

# plot
pou_cols <- c(
  "network" = "#ffecb3ff",
  "network+activated+down" = "#b1b1dcff",
  "network+activated" = "#b1b1dcff",
  "network+down" = "#b1b1dcff",
  "activated+down" = "#475ee6ff",
  "activated" = "#475ee6ff",
  "down" = "#add8e6ff",
  "network+up" = "#fbbea6ff",
  "network+up+repressed" = "#fbbea6ff",
  "up" = "#f08080ff",
  "up+repressed" = "#e85962ff",
  "repressed" = "#de2346ff"
)
sub_dt[, regulation := factor(regulation, levels = names(pou_cols))]
sub_dt[target_expression_lfc < 0, target_expression_lfc := 0]
sub_dt[is.na(target_expression_lfc), target_expression_lfc := 0]
sub_dt[is.na(footprint_score), footprint_score := 0]
gp_pou_diff <- ggplot(sub_dt, aes(target_expression_lfc, footprint_score, color = regulation)) +
  geom_point() +
  scale_color_manual(values = pou_cols) +
  geom_hline(yintercept = 4, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  facet_wrap("regulation") +
  theme(legend.position = "bottom")
gp_pou_diff

How many non-network genes are bound by Pou4?

unique(sub_dt[regulation %in% c("activated", "down", "activated+down")][,.(regulation,target_gene,bound)])[,.N,bound]
##    bound   N
## 1:    NA 334
## 2: FALSE 145
## 3:  TRUE   7
unique(sub_dt[regulation %in% c("repressed", "up", "up+repressed")][,.(regulation,target_gene,bound)])[,.N,bound]
##    bound   N
## 1:    NA 510
## 2: FALSE  82
## 3:  TRUE   2

Cell type level expression for different geroups of Pou4 targets

## Loading required package: metacell
## Loading required package: tgconfig
## Loading required package: tgstat
## Loading required package: tglkmeans
## Warning in supportsMulticoreAndRStudio(...): [ONE-TIME WARNING] Forked
## processing ('multicore') is not supported when running R from RStudio because
## it is considered unstable. For more details, how to control forked processing
## or not, and how to silence this warning in future R sessions, see
## ?parallelly::supportsMulticore
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:Biostrings':
## 
##     collapse, intersect, setdiff, setequal, union
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:graph':
## 
##     union
## The following objects are masked from 'package:igraph':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: RColorBrewer
## Loading required package: rasterpdf
## 
## Attaching package: 'rasterpdf'
## The following object is masked from 'package:grDevices':
## 
##     dev.off
## Loading required package: vioplot
## Loading required package: sm
## Package 'sm', version 2.2-5.7: type help(sm) for summary information
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:sm':
## 
##     muscle
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:patchwork':
## 
##     area
## initializing scdb to ../scRNAseq_nvec_v4/scdb/
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 2.3 GiB

Motifs